{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b7bae21f",
   "metadata": {},
   "source": [
    "### Test code for writing variants to bed file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "020380e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import pysam \n",
    "import argparse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "88f5c0b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "vcf_path=\"/lustre/scratch126/humgen/projects/interval_wgs/final_release_freeze/gt_phased/interval_wgs.chr21.gt_phased.vcf.gz\"\n",
    "max_indel_length = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7a6e0b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "vcf = pysam.VariantFile(vcf_path, mode = \"r\")\n",
    "records = vcf.fetch()\n",
    "for rec in records: \n",
    "    af = rec.info[\"AF\"]\n",
    "    # get chromosome, position, reference and alternative alleles \n",
    "    chrom, pos, ref, alt_alleles = rec.chrom, rec.pos, rec.ref, rec.alts\n",
    "    chrom_num = chrom.split(\"chr\")[1]\n",
    "    # check for multiallelic alleles\n",
    "    if len(alt_alleles) > 1: \n",
    "        raise ValueError(f\"Multiallelic entry in vcf at: {chrom}, {pos}\")\n",
    "    else: \n",
    "        alt = alt_alleles[0]\n",
    "    vcf_vrnt_id = f\"{rec.chrom}:{rec.pos}:{ref}:{alt}\"\n",
    "    # check ref and alt alleles do not contain N or . \n",
    "    if any(b not in \"ATGC\" for b in ref+alt): \n",
    "        raise ValueError(f\"Variant {vcf_vrnt_id} contains unassigned nucleotides.\")\n",
    "    # convert to bed \n",
    "    # deletions (vcf2bed includes base preceding deletion)\n",
    "    if len(ref) > len(alt):\n",
    "        if len(alt) > 1: \n",
    "            raise ValueError(f\"Variant {vcf_vrnt_id} alt allele length is greater than one: {len(alt)}\") \n",
    "        start = pos - 1 \n",
    "        end = pos+len(ref) - 1\n",
    "        length = len(ref) - len(alt)\n",
    "    # snvs and insertions (vcf2bed yields a one-base bed element)\n",
    "    else: \n",
    "        start = pos - 1 \n",
    "        end = pos\n",
    "        length = len(alt) - len(ref)\n",
    "    # check length \n",
    "    if length < 0: \n",
    "        raise ValueError(f\"Negative variant length: {vcf_vrnt_id}\")\n",
    "    # remove indels longer than max length  \n",
    "    elif length > max_indel_length: \n",
    "        continue \n",
    "    else: \n",
    "        print(chrom_num, start, end, vcf_vrnt_id, af)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5e57992",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
